Set up

using <- function(...) {
    libs <- unlist(list(...))
    need <- libs <- libs[!unlist(lapply(libs, require, character.only = T))]
    if(length(need) > 0){ 
        install.packages(need, repos = "https://cloud.r-project.org")
        need <- need[!unlist(lapply(need, require, character.only = T))]
        if (length(need) > 0) {
          if (!requireNamespace("BiocManager", quietly = T))
            install.packages("BiocManager")
          BiocManager::install(need)
        }
    }
    lapply(libs, require, character.only = T)
}
using("data.table", "tidyverse", "Seurat", "patchwork")

MacCarroll retina

Here we will analyze single-cell RNA-seq data from mouse retina, obtained at postnatal day 14. Fresh tissue was dissected, dissociated, and the cell suspension was immediately subjected to the Drop-seq protocol for cell capture and sequencing.

We will start from the Drop-seq output (a matrix of read counts per gene, per cell) of the MacCarrol sample, and perform basic quality control, dimensionality reduction and clustering. We will then analyze the cell populations obtained, infer their cell-type identity and obtain their gene signatures.

Import

Load gene count estimates per cell, and build the basic Seurat object we’ll be using for analysis.

set.seed(42)
DGEmatrix <- fread('Dropseq_p14_retina_Mccarroll.txt', sep = "\t") %>% 
  data.frame(row.names = 1)

# create our base Seurat object, discarding cells that have less than 100 genes
# and discarding genes that are expressed in less than 3 cells. 
Seurat_object <- CreateSeuratObject(DGEmatrix, assay = "RNA",
                                    min.cells = 3, min.features = 100,
                                    project = "Mccarroll_retina")

# number of genes and cells
dim(DGEmatrix)
## [1] 26058  6120

# check DGE matrix
DGEmatrix[1:3, 1:3]
NORM.P14.WR.Mccarrol_r3_GTACTTAATTTC NORM.P14.WR.Mccarrol_r3_CCGGAGGTTAGG NORM.P14.WR.Mccarrol_r3_CTGTCCCCCTAT
0610005C13RIK 0 0 0
0610007N19RIK 0 0 0
0610007P14RIK 0 2 1

QC

We will compute some QC metrics that reflect the overall quality of the sample: coverage (# of genes, # of UMIs) and mitochondrial read proportions – which is often an indicator of stress/cell damage

# get a list of mitochondrial genes: all genes starting with 'MT'
mito.genes <- grep("^MT-", rownames(x = Seurat_object@assays$RNA), value = TRUE)

# compute proportions
percent.mito <- Matrix::colSums(Seurat_object@assays$RNA[mito.genes,]) /
  Matrix::colSums(Seurat_object@assays$RNA)

# add the information back to the seurat object
Seurat_object <- AddMetaData(object = Seurat_object, metadata = percent.mito,
                             col.name = "percent.mito")

# get a list of crystal genes (all genes starting by 'CRY') to remove crystalin contamination
crystal.genes <- grep("^CRY[AB]", rownames(x = Seurat_object@assays$RNA), value = TRUE)

# compute proportions
percent.crystal <- Matrix::colSums(Seurat_object@assays$RNA[crystal.genes, ]) / 
  Matrix::colSums(Seurat_object@assays$RNA)

# add the information back to the seurat object
Seurat_object <- AddMetaData(object = Seurat_object, metadata = percent.crystal,
                             col.name = "percent.crystal")

# display distribution of some metrics:
# # of genes, # of UMIs, and mitochondrial/crystal proportion
fts <- c("nFeature_RNA", "nCount_RNA", "percent.mito", "percent.crystal")
VlnPlot(object = Seurat_object, ncol = 4, pt.size = 0, features = fts)


# Get some summary stats for the sample:
summary_stats_before_filtering <- tibble(
  total_cells  = nrow(Seurat_object@meta.data),
  mean_n_genes = mean(Seurat_object@meta.data$nFeature_RNA),
  sd_n_genes   = sd(Seurat_object@meta.data$nFeature_RNA),
  max_n_genes  = max(Seurat_object@meta.data$nFeature_RNA),
  min_n_genes  = min(Seurat_object@meta.data$nFeature_RNA),
  mean_UMI     = mean(Seurat_object@meta.data$nCount_RNA),
  sd_UMI       = sd(Seurat_object@meta.data$nCount_RNA),
  max_UMI      = max(Seurat_object@meta.data$nCount_RNA),
  min_UMI      = min(Seurat_object@meta.data$nCount_RNA)
) %>% mutate_all(function(x) round(x, 2))

summary_stats_before_filtering
total_cells mean_n_genes sd_n_genes max_n_genes min_n_genes mean_UMI sd_UMI max_UMI min_UMI
6119 530.76 544.2 6428 107 829.92 1272.01 25948 219

Filtering

We saw that the distribution of some metrics showed some low quality cells (often appear as bimodal distributions for the number of genes and the mitochondrial proportions, and the very long tails). Although those are not many cells, they can bother for clustering and cell-type identification. Let’s get rid of the worst offenders, and recompute QC metrics

# We'll define some cutoffs to remove cells that deviate too much from the rest of the
# cells in the sample. These can be hard cut-offs or dataset-specific ones based on SD.
# Sometimes those represent multiplets, or just abnormal cells like crystalin cells
Seurat_object <- subset(Seurat_object,  subset = nFeature_RNA > 100 & nFeature_RNA < 6000 & 
                          nCount_RNA < 10000 & percent.mito < 0.10 & percent.crystal < 0.025)
Seurat_object <- subset(Seurat_object,  subset = nFeature_RNA > 100 & nFeature_RNA < 6000 & 
                          nCount_RNA < 10000 & percent.mito < 0.10 & percent.crystal < 0.025)

VlnPlot(object = Seurat_object, pt.size = 0, ncol = 4, features = fts)


# Get some summary stats for the sample:
summary_stats_after_filtering <- tibble(
  total_cells  = nrow(Seurat_object@meta.data),
  mean_n_genes = mean(Seurat_object@meta.data$nFeature_RNA),
  sd_n_genes   = sd(Seurat_object@meta.data$nFeature_RNA),
  max_n_genes  = max(Seurat_object@meta.data$nFeature_RNA),
  min_n_genes  = min(Seurat_object@meta.data$nFeature_RNA),
  mean_UMI     = mean(Seurat_object@meta.data$nCount_RNA),
  sd_UMI       = sd(Seurat_object@meta.data$nCount_RNA),
  max_UMI      = max(Seurat_object@meta.data$nCount_RNA),
  min_UMI      = min(Seurat_object@meta.data$nCount_RNA)
) %>% mutate_all(function(x) round(x, 2))

summary_stats_after_filtering
total_cells mean_n_genes sd_n_genes max_n_genes min_n_genes mean_UMI sd_UMI max_UMI min_UMI
5858 527.88 493.86 4012 135 799.97 962.52 9956 219

print(
  paste0("Number of filtered cells: ",
         summary_stats_before_filtering$total_cells -
           summary_stats_after_filtering$total_cells
  )
)
## [1] "Number of filtered cells: 261"

Not much was removed, only a few outlier cells, because this was a quite good quality data to begin with. In other datasets, the before/after QC metrics will look much more different.


Normalization

# before proceeding, we note markers for cell cycle
Seurat_object <- CellCycleScoring(Seurat_object, set.ident = FALSE,
                                  s.features = cc.genes$s.genes,
                                  g2m.features = cc.genes$g2m.genes)

# we then go on to apply a transforms the data and regresses out unwanted variation
vs <- c("nFeature_RNA", "percent.mito", "S.Score", "G2M.Score")
Seurat_object <- SCTransform(Seurat_object, verbose = T,
                             vars.to.regress = vs)

FeatureScatter(Seurat_object, feature1 = "nCount_RNA", feature2 = "percent.mito") +
  FeatureScatter(Seurat_object, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") &
  theme(legend.position = 'none')

Dimension reduction

PCA

We will perform a first dimension reduction by using only the most variable genes. Then, we’ll transform the data using principal component analysis (PCA), and use only the first few components for downstream analyses.

# a bit of wrangling to prepare for VariableFeaturePlot
Seurat_object[['SCT']]@meta.features <- SCTResults(Seurat_object[['SCT']], slot = 'feature.attributes')[, c('gmean', 'variance', 'residual_variance')]
Seurat_object[['SCT']]@meta.features$variable <- F
Seurat_object[['SCT']]@meta.features[VariableFeatures(Seurat_object[['SCT']]), 'variable'] <- F
colnames(Seurat_object[["SCT"]]@meta.features) <- paste0("sct.", colnames(Seurat_object[["SCT"]]@meta.features))

# label top 10 most variable features
VariableFeaturePlot(Seurat_object, selection.method = 'sct', assay = 'SCT') %>%
  LabelPoints(points = head(VariableFeatures(Seurat_object), 10), repel = T) &
  theme(legend.position = 'none')


# perform PCA
Seurat_object <- RunPCA(Seurat_object, pcs.compute = 100, do.print = F)

# display genes correlated with PCs 1 & 2
VizDimLoadings(Seurat_object, dims = 1:2, reduction = "pca")


# alternative representation of genes highly correlated with PC1
DimHeatmap(Seurat_object, dims = 1:15, cells = 500, balanced = T)


# inspect the standard deviation of each PC
ElbowPlot(Seurat_object)


# assess effect of cell cycle
DimPlot(Seurat_object, reduction = "pca", group.by = "Phase")

Non-linear approaches

Seurat_object <- RunUMAP(Seurat_object, dims = 1:20)
Seurat_object <- RunTSNE(Seurat_object, dims = 1:20)

Clustering

# cluster the cells based on UMAP reduction
Seurat_object <- FindNeighbors(Seurat_object, 
                               dims = 1:2, 
                               reduction = "umap")
# the resolution can be adjusted to tweak clustering accuracy
# usually somewhere around resolution = 1 
Seurat_object <- FindClusters(Seurat_object, 
                              resolution = 0.5, 
                              reduction = "umap")
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5858
## Number of edges: 127309
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9411
## Number of communities: 21
## Elapsed time: 0 seconds

UMAP

DimPlot(Seurat_object, reduction = "umap", label = T)

tSNE

DimPlot(Seurat_object, reduction = "tsne", label = T)

Metrics

VlnPlot(object = Seurat_object, pt.size = 0.1, ncol = 1, features = fts)


Identifying cell types

We now have some clusters of cells (cell populations), but we still don’t know what these cells are. One can apply differential expression analysis to find marker genes higher expressed in every given cluster as compared to all remaining cells, and then infer the cell type based on current knowledge on those genes.

Seurat_object.markers <- FindAllMarkers(Seurat_object, only.pos = TRUE,
                                        min.pct = 0.25, logfc.threshold = 0.25)
head(Seurat_object.markers)
p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
RPGRIP1 0 0.8758899 0.883 0.527 0 0 RPGRIP1
A930011O12RIK 0 0.7883033 0.800 0.434 0 0 A930011O12RIK
RP1 0 0.7193798 0.913 0.595 0 0 RP1
MALAT1 0 0.6033796 0.988 0.874 0 0 MALAT1
SAMD11 0 0.4329792 0.559 0.315 0 0 SAMD11
PEX5L 0 0.3805290 0.526 0.297 0 0 PEX5L

We can take a look at the top 10 markers in each cluster

top10 <- Seurat_object.markers %>% 
  group_by(cluster) %>% 
  slice_max(avg_log2FC, n = 10) %>%
  ungroup()
DotPlot(
  Seurat_object,
  assay = NULL,
  unique(top10$gene),
  cols = c("blue", "red"),
  col.min = -2.5,
  col.max = 2.5,
  dot.min = 0,
  dot.scale = 6,
  group.by = NULL,
  split.by = NULL,
  scale.by = "radius",
  scale.min = NA,
  scale.max = NA
) + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))

Alternatively, one could directly use known marker genes or published expression profiles of purified / sorted cells.

markers.retina.dotplot <- toupper(c(
  "Rho",  #Rods
  "Opn1mw", #Cones
  "Trpm1", #Bipolar cells
  "Scgn", #Bipolar cells
  "Celf4", #Amacrine cells
  "Rgs5", #Pericytes
  "Cldn5", #Endothelial cells
  "Lyz2", #Immune cells
  "Glul", # Muller glia
  "Gfap", #Astrocytes
  "Optc", #Lens cells
  "Lhx1"  #Horizontal cells
))

Example

DefaultAssay(Seurat_object) <- 'RNA'
FeaturePlot(Seurat_object, features = 'TRPM1', reduction = 'umap')

Dot plots

DefaultAssay(Seurat_object) <- 'SCT'
DotPlot(
  Seurat_object,
  assay = NULL,
  markers.retina.dotplot,
  cols = c("blue", "red"),
  col.min = -2.5,
  col.max = 2.5,
  dot.min = 0,
  dot.scale = 6,
  group.by = NULL,
  split.by = NULL,
  scale.by = "radius",
  scale.min = NA,
  scale.max = NA
) + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))

Violin plots

DefaultAssay(Seurat_object) <- 'SCT'
VlnPlot(object = Seurat_object, features = markers.retina.dotplot, pt.size = 0.1, stack=TRUE)


Annotation

  1. Assign as many cluster labels as you can based on the provided information
DefaultAssay(Seurat_object) <- 'SCT'
Seurat_object <- SetIdent(Seurat_object, cells = NULL, value="seurat_clusters")
new.cluster.ids <- c("",     # cluster 0
                     "",     # cluster 1
                     "",     # cluster 2
                     "",     # cluster 3
                     "",     # cluster 4
                     "",     # cluster 5
                     "",     # cluster 6
                     "",     # cluster 7
                     "",     # cluster 8
                     "",     # cluster 9
                     "",     # cluster 10
                     "",     # cluster 11
                     "",     # cluster 12
                     "",     # cluster 13
                     "",     # cluster 14
                     "",     # cluster 15
                     "",     # cluster 16
                     "",     # cluster 17
                     "",     # cluster 18
                     "",     # cluster 19
                     ""      # cluster 20
)
names(new.cluster.ids) <- levels(Seurat_object)
Seurat_object <- RenameIdents(Seurat_object, new.cluster.ids)

Seurat_object[["Cell_Type"]] <- Idents(object = Seurat_object)

DimPlot(Seurat_object, reduction = "umap", label = T)

Joyal retina

Now that you know how to process single-cell samples, perform QC, identify cell populations and assign identities, you are ready to analyze a dataset on your own! Together with the MacCarroll sample we just analyzed, we have another mouse retinal sample from the Joyal Lab. Instead of repeating the same thing we’ll start with pre-computed Seurat objects.

Merging

# import MacCarroll dataset
Seurat_object_MacCarroll <- readRDS("Seurat_object_MacCarroll.rds")
Seurat_object_MacCarroll[["Dataset"]] <- "MacCarroll"

# import Joyal dataset
Seurat_object_Joyal <- readRDS("Seurat_object_Joyal.rds")
Seurat_object_Joyal[["Dataset"]] <- "Joyal"

# merge the 2 dataset
Seurat_object <- merge(Seurat_object_MacCarroll,
                       y = Seurat_object_Joyal,
                       add.cell.ids = NULL,
                       project = "Lab_integration")
Seurat_object.list <- SplitObject(Seurat_object, split.by = "Dataset") %>%
  lapply(SCTransform, verbose = F, vars.to.regress = vs)

Integration

# select most consistently variable features
Seurat_object.features <- SelectIntegrationFeatures(object.list = Seurat_object.list,
                                                    nfeatures = 3000)

# some data wrangling
Seurat_object.list <- PrepSCTIntegration(object.list = Seurat_object.list,
                                         anchor.features = Seurat_object.features)

# identify anchors shared by the datasets
Seurat_object.anchors <- FindIntegrationAnchors(object.list = Seurat_object.list,
                                                normalization.method = "SCT", 
                                                anchor.features = Seurat_object.features)

# proceed with integration
Seurat_object <- IntegrateData(anchorset = Seurat_object.anchors,
                               normalization.method = "SCT")

Dimension reduction

Seurat_object <- RunPCA(Seurat_object) %>%
  RunUMAP(dims = 1:20) %>%
  RunTSNE(dims = 1:20)
  1. Which gene is most correlated with PC1? Is it the same as when we looked at the first dataset alone? What about PC2?

Clustering

Seurat_object <- FindNeighbors(Seurat_object, 
                               dims = 1:2, 
                               reduction = "umap")
Seurat_object <- FindClusters(Seurat_object, 
                              resolution = 0.5, 
                              reduction = "umap")
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 8393
## Number of edges: 183579
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9515
## Number of communities: 29
## Elapsed time: 0 seconds

UMAP

Together

DimPlot(Seurat_object, reduction = "umap", label=TRUE)

Split

DimPlot(Seurat_object, reduction = "umap", label=TRUE, split.by="Dataset")

tSNE

Together

DimPlot(Seurat_object, reduction = "tsne", label=TRUE)

Split

DimPlot(Seurat_object, reduction = "tsne", label=TRUE, split.by="Dataset")


Identifying cell types

We could again identify markers genes as before

DefaultAssay(Seurat_object) <- "SCT"
Seurat_object.markers <- FindAllMarkers(Seurat_object, only.pos = TRUE,
                                        min.pct = 0.25, logfc.threshold = 0.25)
head(Seurat_object.markers)
p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
GNGT1 0 1.0487288 0.993 0.674 0 0 GNGT1
RCVRN 0 0.9475006 0.944 0.514 0 0 RCVRN
RHO 0 1.0020691 0.996 0.647 0 0 RHO
PDE6G 0 0.9574784 0.962 0.545 0 0 PDE6G
PDC 0 0.7846371 0.984 0.612 0 0 PDC
ROM1 0 0.6998536 0.769 0.387 0 0 ROM1
top10 <- Seurat_object.markers %>% 
  group_by(cluster) %>% 
  slice_max(avg_log2FC, n = 10) %>%
  ungroup()
DotPlot(
  Seurat_object,
  assay = NULL,
  unique(top10$gene),
  cols = c("blue", "red"),
  col.min = -2.5,
  col.max = 2.5,
  dot.min = 0,
  dot.scale = 6,
  group.by = NULL,
  split.by = NULL,
  scale.by = "radius",
  scale.min = NA,
  scale.max = NA
) + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))

Or again adopt known markers

DotPlot(
  Seurat_object,
  assay = NULL,
  markers.retina.dotplot,
  cols = c("blue", "red"),
  col.min = -2.5,
  col.max = 2.5,
  dot.min = 0,
  dot.scale = 6,
  group.by = NULL,
  split.by = NULL,
  scale.by = "radius",
  scale.min = NA,
  scale.max = NA
) + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))

Or make use of pre-existing labels annotated independently

DimPlot(Seurat_object, reduction = "umap", label = TRUE, 
        pt.size = 0.5, group.by="Cell_Type") + NoLegend()


Annotation

  1. Assign as many cluster labels as you can based on the provided information
Seurat_object <- SetIdent(Seurat_object, cells = NULL, value="seurat_clusters")
new.cluster.ids <- c("",     # cluster 0
                     "",     # cluster 1
                     "",     # cluster 2
                     "",     # cluster 3
                     "",     # cluster 4
                     "",     # cluster 5
                     "",     # cluster 6
                     "",     # cluster 7
                     "",     # cluster 8
                     "",     # cluster 9
                     "",     # cluster 10
                     "",     # cluster 11
                     "",     # cluster 12
                     "",     # cluster 13
                     "",     # cluster 14
                     "",     # cluster 15
                     "",     # cluster 16
                     "",     # cluster 17
                     "",     # cluster 18
                     "",     # cluster 19
                     "",     # cluster 20
                     "",     # cluster 21
                     "",     # cluster 22
                     "",     # cluster 23
                     "",     # cluster 24
                     "",     # cluster 25
                     "",     # cluster 26
                     "",     # cluster 27
                     ""      # cluster 28
)
names(new.cluster.ids) <- levels(Seurat_object)
Seurat_object <- RenameIdents(Seurat_object, new.cluster.ids)

Seurat_object[["Cell_Type"]] <- Idents(object = Seurat_object)

DimPlot(Seurat_object, reduction = "umap", label = T)